In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from googletrans import Translator
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv('Amazon Sale Report.csv')
df.T
Out[2]:
0 1 2 3 4 5 6 7 8 9 ... 128965 128966 128967 128968 128969 128970 128971 128972 128973 128974
index 0 1 2 3 4 5 6 7 8 9 ... 128965 128966 128967 128968 128969 128970 128971 128972 128973 128974
Order ID 405-8078784-5731545 171-9198151-1101146 404-0687676-7273146 403-9615377-8133951 407-1069790-7240320 404-1490984-4578765 408-5748499-6859555 406-7807733-3785945 407-5443024-5233168 402-4393761-0311520 ... 408-5154281-4593912 406-9812666-2474761 404-5182288-1653947 403-7059995-7618722 404-3802633-7250760 406-6001380-7673107 402-9551604-7544318 407-9547469-3152358 402-6184140-0545956 408-7436540-8728312
Date 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 04-30-22 ... 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22 05-31-22
Status Cancelled Shipped - Delivered to Buyer Shipped Cancelled Shipped Shipped Shipped Shipped - Delivered to Buyer Cancelled Shipped ... Cancelled Shipped Cancelled Shipped Cancelled Shipped Shipped Shipped Shipped Shipped
Fulfilment Merchant Merchant Amazon Merchant Amazon Amazon Amazon Merchant Amazon Amazon ... Amazon Amazon Amazon Amazon Amazon Amazon Amazon Amazon Amazon Amazon
Sales Channel Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in ... Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in
ship-service-level Standard Standard Expedited Standard Expedited Expedited Expedited Standard Expedited Expedited ... Expedited Expedited Expedited Expedited Expedited Expedited Expedited Expedited Expedited Expedited
Style SET389 JNE3781 JNE3371 J0341 JNE3671 SET264 J0095 JNE3405 SET200 JNE3461 ... J0119 SET224 JNE3638 SET264 SET044 JNE3697 SET401 J0157 J0012 J0003
SKU SET389-KR-NP-S JNE3781-KR-XXXL JNE3371-KR-XL J0341-DR-L JNE3671-TU-XXXL SET264-KR-NP-XL J0095-SET-L JNE3405-KR-S SET200-KR-NP-A-XXXL JNE3461-KR-XXL ... J0119-TP-XXXL SET224-KR-NP-XS JNE3638-KR-XS SET264-KR-NP-XL SET044-KR-NP-M JNE3697-KR-XL SET401-KR-NP-M J0157-DR-XXL J0012-SKD-XS J0003-SET-S
Category Set kurta kurta Western Dress Top Set Set kurta Set kurta ... Top Set kurta Set Set kurta Set Western Dress Set Set
Size S 3XL XL L 3XL XL L S 3XL XXL ... 3XL XS XS XL M XL M XXL XS S
ASIN B09KXVBD7Z B09K3WFS32 B07WV4JV4D B099NRCT7B B098714BZP B08YN7XDSG B08CMHNWBN B081WX4G4Q B08L91ZZXN B08B3XF5MH ... B08RYPRVPV B08MXDBRK1 B09814Q3QH B08YN7XDSG B07Q2RTSFB B098112V2V B09VC6KHX8 B0982YZ51B B0894Y2NJQ B0894X27FC
Courier Status NaN Shipped Shipped NaN Shipped Shipped Shipped Shipped Cancelled Shipped ... Unshipped Shipped Cancelled Shipped Unshipped Shipped Shipped Shipped Shipped Shipped
Qty 0 1 1 0 1 1 1 1 0 1 ... 1 1 0 1 1 1 1 1 1 1
currency INR INR INR INR INR INR INR INR NaN INR ... INR INR NaN INR INR INR INR INR INR INR
Amount 647.62 406.0 329.0 753.33 574.0 824.0 653.0 399.0 NaN 363.0 ... 574.0 1132.0 NaN 824.0 612.0 517.0 999.0 690.0 1199.0 696.0
ship-city MUMBAI BENGALURU NAVI MUMBAI PUDUCHERRY CHENNAI GHAZIABAD CHANDIGARH HYDERABAD HYDERABAD Chennai ... Prayagraj (ALLAHABAD) CHENNAI 600042 Kolkata Delhi MUMBAI HYDERABAD GURUGRAM HYDERABAD Halol Raipur
ship-state MAHARASHTRA KARNATAKA MAHARASHTRA PUDUCHERRY TAMIL NADU UTTAR PRADESH CHANDIGARH TELANGANA TELANGANA TAMIL NADU ... UTTAR PRADESH TAMIL NADU WEST BENGAL DELHI MAHARASHTRA TELANGANA HARYANA TELANGANA Gujarat CHHATTISGARH
ship-postal-code 400081.0 560085.0 410210.0 605008.0 600073.0 201102.0 160036.0 500032.0 500008.0 600041.0 ... 211007.0 600042.0 700040.0 110053.0 400017.0 500013.0 122004.0 500049.0 389350.0 492014.0
ship-country IN IN IN IN IN IN IN IN IN IN ... IN IN IN IN IN IN IN IN IN IN
promotion-ids NaN Amazon PLCC Free-Financing Universal Merchant ... IN Core Free Shipping 2015/04/08 23-48-5-108 NaN NaN IN Core Free Shipping 2015/04/08 23-48-5-108 IN Core Free Shipping 2015/04/08 23-48-5-108 Amazon PLCC Free-Financing Universal Merchant ... IN Core Free Shipping 2015/04/08 23-48-5-108 NaN ... NaN NaN NaN IN Core Free Shipping 2015/04/08 23-48-5-108 NaN NaN IN Core Free Shipping 2015/04/08 23-48-5-108 NaN IN Core Free Shipping 2015/04/08 23-48-5-108 IN Core Free Shipping 2015/04/08 23-48-5-108
B2B False False True False False False False False False False ... False False False False False False False False False False
fulfilled-by Easy Ship Easy Ship NaN Easy Ship NaN NaN NaN Easy Ship NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Unnamed: 22 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... False False False False False False False False False False

24 rows × 128975 columns

In [3]:
def sniff_modified(df):
    with pd.option_context("display.max_colwidth", 20):
        info = pd.DataFrame()
        info['data type'] = df.dtypes
        info['percent missing'] = df.isnull().sum()*100/len(df)
        info['No. unique'] = df.apply(lambda x: len(x.unique()))
        info['unique values'] = df.apply(lambda x: x.unique())
        return info.sort_values('data type')
In [4]:
sniff_modified(df)
Out[4]:
data type percent missing No. unique unique values
B2B bool 0.000000 2 [False, True]
index int64 0.000000 128975 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
Qty int64 0.000000 10 [0, 1, 2, 15, 3, 9, 13, 5, 4, 8]
ship-postal-code float64 0.025586 9460 [400081.0, 560085.0, 410210.0, 605008.0, 60007...
Amount float64 6.043807 1411 [647.62, 406.0, 329.0, 753.33, 574.0, 824.0, 6...
promotion-ids object 38.110487 5788 [nan, Amazon PLCC Free-Financing Universal Mer...
ship-country object 0.025586 2 [IN, nan]
ship-state object 0.025586 70 [MAHARASHTRA, KARNATAKA, PUDUCHERRY, TAMIL NAD...
ship-city object 0.025586 8956 [MUMBAI, BENGALURU, NAVI MUMBAI, PUDUCHERRY, C...
currency object 6.043807 2 [INR, nan]
Courier Status object 5.328164 4 [nan, Shipped, Cancelled, Unshipped]
ASIN object 0.000000 7190 [B09KXVBD7Z, B09K3WFS32, B07WV4JV4D, B099NRCT7...
Size object 0.000000 11 [S, 3XL, XL, L, XXL, XS, 6XL, M, 4XL, 5XL, Free]
Category object 0.000000 9 [Set, kurta, Western Dress, Top, Ethnic Dress,...
SKU object 0.000000 7195 [SET389-KR-NP-S, JNE3781-KR-XXXL, JNE3371-KR-X...
Style object 0.000000 1377 [SET389, JNE3781, JNE3371, J0341, JNE3671, SET...
ship-service-level object 0.000000 2 [Standard, Expedited]
Sales Channel object 0.000000 2 [Amazon.in, Non-Amazon]
Fulfilment object 0.000000 2 [Merchant, Amazon]
Status object 0.000000 13 [Cancelled, Shipped - Delivered to Buyer, Ship...
Date object 0.000000 91 [04-30-22, 04-29-22, 04-28-22, 04-27-22, 04-26...
Order ID object 0.000000 120378 [405-8078784-5731545, 171-9198151-1101146, 404...
fulfilled-by object 69.546811 2 [Easy Ship, nan]
Unnamed: 22 object 38.030626 2 [nan, False]
In [5]:
df['Date']=pd.to_datetime(df['Date'])
In [6]:
df.head().T
Out[6]:
0 1 2 3 4
index 0 1 2 3 4
Order ID 405-8078784-5731545 171-9198151-1101146 404-0687676-7273146 403-9615377-8133951 407-1069790-7240320
Date 2022-04-30 00:00:00 2022-04-30 00:00:00 2022-04-30 00:00:00 2022-04-30 00:00:00 2022-04-30 00:00:00
Status Cancelled Shipped - Delivered to Buyer Shipped Cancelled Shipped
Fulfilment Merchant Merchant Amazon Merchant Amazon
Sales Channel Amazon.in Amazon.in Amazon.in Amazon.in Amazon.in
ship-service-level Standard Standard Expedited Standard Expedited
Style SET389 JNE3781 JNE3371 J0341 JNE3671
SKU SET389-KR-NP-S JNE3781-KR-XXXL JNE3371-KR-XL J0341-DR-L JNE3671-TU-XXXL
Category Set kurta kurta Western Dress Top
Size S 3XL XL L 3XL
ASIN B09KXVBD7Z B09K3WFS32 B07WV4JV4D B099NRCT7B B098714BZP
Courier Status NaN Shipped Shipped NaN Shipped
Qty 0 1 1 0 1
currency INR INR INR INR INR
Amount 647.62 406.0 329.0 753.33 574.0
ship-city MUMBAI BENGALURU NAVI MUMBAI PUDUCHERRY CHENNAI
ship-state MAHARASHTRA KARNATAKA MAHARASHTRA PUDUCHERRY TAMIL NADU
ship-postal-code 400081.0 560085.0 410210.0 605008.0 600073.0
ship-country IN IN IN IN IN
promotion-ids NaN Amazon PLCC Free-Financing Universal Merchant ... IN Core Free Shipping 2015/04/08 23-48-5-108 NaN NaN
B2B False False True False False
fulfilled-by Easy Ship Easy Ship NaN Easy Ship NaN
Unnamed: 22 NaN NaN NaN NaN NaN
In [7]:
df.isnull().sum()
Out[7]:
index                     0
Order ID                  0
Date                      0
Status                    0
Fulfilment                0
Sales Channel             0
ship-service-level        0
Style                     0
SKU                       0
Category                  0
Size                      0
ASIN                      0
Courier Status         6872
Qty                       0
currency               7795
Amount                 7795
ship-city                33
ship-state               33
ship-postal-code         33
ship-country             33
promotion-ids         49153
B2B                       0
fulfilled-by          89698
Unnamed: 22           49050
dtype: int64
In [8]:
columns_to_delete = ['Unnamed: 22', 'fulfilled-by', 'promotion-ids', 'ship-postal-code', 'currency', 'ship-country','Sales Channel ', 'index', 'SKU']
df = df.drop(columns=columns_to_delete)
In [9]:
sniff_modified(df)
Out[9]:
data type percent missing No. unique unique values
Date datetime64[ns] 0.000000 91 [2022-04-30 00:00:00, 2022-04-29 00:00:00, 202...
B2B bool 0.000000 2 [False, True]
Qty int64 0.000000 10 [0, 1, 2, 15, 3, 9, 13, 5, 4, 8]
Amount float64 6.043807 1411 [647.62, 406.0, 329.0, 753.33, 574.0, 824.0, 6...
Order ID object 0.000000 120378 [405-8078784-5731545, 171-9198151-1101146, 404...
Status object 0.000000 13 [Cancelled, Shipped - Delivered to Buyer, Ship...
Fulfilment object 0.000000 2 [Merchant, Amazon]
ship-service-level object 0.000000 2 [Standard, Expedited]
Style object 0.000000 1377 [SET389, JNE3781, JNE3371, J0341, JNE3671, SET...
Category object 0.000000 9 [Set, kurta, Western Dress, Top, Ethnic Dress,...
Size object 0.000000 11 [S, 3XL, XL, L, XXL, XS, 6XL, M, 4XL, 5XL, Free]
ASIN object 0.000000 7190 [B09KXVBD7Z, B09K3WFS32, B07WV4JV4D, B099NRCT7...
Courier Status object 5.328164 4 [nan, Shipped, Cancelled, Unshipped]
ship-city object 0.025586 8956 [MUMBAI, BENGALURU, NAVI MUMBAI, PUDUCHERRY, C...
ship-state object 0.025586 70 [MAHARASHTRA, KARNATAKA, PUDUCHERRY, TAMIL NAD...
In [10]:
df.shape
Out[10]:
(128975, 15)
In [11]:
df_cleaned_rows = df.dropna()
sniff_modified(df_cleaned_rows)
Out[11]:
data type percent missing No. unique unique values
Date datetime64[ns] 0.0 91 [2022-04-30 00:00:00, 2022-04-29 00:00:00, 202...
B2B bool 0.0 2 [False, True]
Qty int64 0.0 6 [1, 2, 3, 5, 4, 8]
Amount float64 0.0 867 [406.0, 329.0, 574.0, 824.0, 653.0, 399.0, 363...
Order ID object 0.0 108271 [171-9198151-1101146, 404-0687676-7273146, 407...
Status object 0.0 12 [Shipped - Delivered to Buyer, Shipped, Cancel...
Fulfilment object 0.0 2 [Merchant, Amazon]
ship-service-level object 0.0 2 [Standard, Expedited]
Style object 0.0 1371 [JNE3781, JNE3371, JNE3671, SET264, J0095, JNE...
Category object 0.0 9 [kurta, Top, Set, Western Dress, Ethnic Dress,...
Size object 0.0 11 [3XL, XL, L, S, XXL, XS, 6XL, M, 4XL, 5XL, Free]
ASIN object 0.0 7128 [B09K3WFS32, B07WV4JV4D, B098714BZP, B08YN7XDS...
Courier Status object 0.0 2 [Shipped, Unshipped]
ship-city object 0.0 8499 [BENGALURU, NAVI MUMBAI, CHENNAI, GHAZIABAD, C...
ship-state object 0.0 67 [KARNATAKA, MAHARASHTRA, TAMIL NADU, UTTAR PRA...
In [12]:
df_cleaned_rows.shape
Out[12]:
(116016, 15)
In [13]:
## !pip install googletrans==4.0.0-rc1
In [14]:
translator = Translator()

def translate_to_english(text):
    if text.isascii():
        return text
    else:
        translation = translator.translate(text, src='hi', dest='en')
        return translation.text

df_cleaned_rows['ship-city'] = df_cleaned_rows['ship-city'].apply(translate_to_english)
In [15]:
df_cleaned_rows['ship-city']=df_cleaned_rows['ship-city'].str.capitalize()
In [16]:
df_cleaned_rows['ship-city']=df_cleaned_rows['ship-city'].replace({'Guwahati, kamrup (m)': 'Guwahati',
                                                                   'Kolkata 700034':'Kolkata',
                                                                   'Dhamnod dhar district': 'Dhamnod dhar',
                                                                   'Nk sweets vikasnagar':'vikasnagar',
                                                                   'Mandvi kachchh district':'Mandvi kachchh',
                                                                   'Andul near maya stores':'Andul',
                                                                   'district': '',
                                                                   'Neelakudi, thiruvarur': 'Neelakudi',
                                                                   'Arvi,dist- wardha':'Arvi',
                                                                   'Mumbai 400101': 'Mumbai',
                                                                   'Kamatgi,hunugund taluk,bagalkot district':'bagalkot',
                                                                   'extension': '',
                                                                   'Durganagar ,nimta south':'Durganagar',
                                                                   'and  ':'',
                                                                   'Mumbai,malad west,malvani.':'malvani',
                                                                   'Hassan (amazon arun)':'Hassan',
                                                                   'Phanigiri road,chaitanyapuri,hyderabad':'chaitanyapuri',
                                                                   'Town.budhana village. husainpur kalan':'husainpur',
                                                                   'New delhi-110075':'New delhi',
                                                                   'Mumbai -400064':'Mumbai',
                                                                   'Mumbai 400023':'Mumbai',
                                                                   'Post office-harrawala, dehradun':'dehradun',
                                                                   'Nuvem.   ( do not ring the door bell, call b4 comg':'Nuvem',
                                                                   'Tehsil - sikandrabad, district - bulandshahr':'bulandshahr',
                                                                   'Mumbai 400057':'Mumbai',
                                                                   'Pune-412207':'Pune'
                                                                    })
In [17]:
df_cleaned_rows['ship-city'].unique()
Out[17]:
array(['Bengaluru', 'Navi mumbai', 'Chennai', ...,
       'Vaishali nagar nagpur', 'Gulabpura, distt bhilwara',
       'Prayagraj (allahabad)'], dtype=object)
In [18]:
df_cleaned_rows['ship-state']=df_cleaned_rows['ship-state'].str.capitalize()
In [19]:
df_cleaned_rows['ship-state']=df_cleaned_rows['ship-state'].replace({'Punjab/mohali/zirakpur': 'Punjab',
                                                                     'rajshthan': 'Rajasthan',
                                                                    'Puducherry':'Pondicherry',
                                                                    'Pb': 'Punjab',
                                                                    'Ar': 'Arunachal pradesh',
                                                                    'Nl': 'Nagaland',
                                                                    'Rj': 'Rajasthan',
                                                                    'Orissa':'Odisha',
                                                                     'Rajsthan': 'Rajasthan',
                                                                     'New delhi': 'Delhi'
                                                                    })
In [20]:
df_cleaned_rows['ship-state'].unique()
Out[20]:
array(['Karnataka', 'Maharashtra', 'Tamil nadu', 'Uttar pradesh',
       'Chandigarh', 'Telangana', 'Andhra pradesh', 'Rajasthan', 'Delhi',
       'Haryana', 'Assam', 'Jharkhand', 'Chhattisgarh', 'Odisha',
       'Kerala', 'Madhya pradesh', 'West bengal', 'Nagaland', 'Gujarat',
       'Uttarakhand', 'Bihar', 'Jammu & kashmir', 'Punjab',
       'Himachal pradesh', 'Arunachal pradesh', 'Manipur', 'Goa',
       'Meghalaya', 'Pondicherry', 'Tripura', 'Ladakh', 'Dadra and nagar',
       'Sikkim', 'Andaman & nicobar ', 'Rajshthan', 'Mizoram',
       'Lakshadweep'], dtype=object)
In [21]:
df_cleaned_rows['Order ID'] = df_cleaned_rows['Order ID'].apply(lambda x: re.sub(r'\D', '', str(x)))

df_cleaned_rows['Order ID'] = pd.to_numeric(df_cleaned_rows['Order ID'], errors='coerce')
In [22]:
df_cleaned_rows.shape
Out[22]:
(116016, 15)
In [23]:
# df_cleaned_rows.to_excel('marketing.xlsx')

Top 5 products (using Size) sold by Qty¶

In [24]:
top5_size = df_cleaned_rows.groupby('Size')['Qty'].sum().sort_values(ascending = False).head(5)
top5_size
Out[24]:
Size
M      20422
L      19963
XL     18898
XXL    16487
S      15309
Name: Qty, dtype: int64

Top 5 state (using ship-state) sold by Amount¶

In [25]:
top5_city = df_cleaned_rows.groupby('ship-state')['Amount'].sum().sort_values(ascending = False).head(5)
top5_city
Out[25]:
ship-state
Maharashtra      12864511.0
Karnataka        10153100.0
Telangana         6642955.0
Uttar pradesh     6494393.0
Tamil nadu        6241913.0
Name: Amount, dtype: float64
In [26]:
# df_cleaned_rows.to_csv('output1.csv', index=False)
In [27]:
categorical_cols= ['B2B','Fulfilment','ship-service-level','Courier Status', 'Size', 'Status', 'Category', 'Qty', 'Style','ASIN','ship-city','ship-state' ]
In [28]:
categorical_cols
Out[28]:
['B2B',
 'Fulfilment',
 'ship-service-level',
 'Courier Status',
 'Size',
 'Status',
 'Category',
 'Qty',
 'Style',
 'ASIN',
 'ship-city',
 'ship-state']
In [29]:
all_cols = ['Order ID','Amount']
In [30]:
all_cols
Out[30]:
['Order ID', 'Amount']
In [31]:
for col in all_cols:
    fig, ax = plt.subplots(figsize=(10, 6))
    df_cleaned_rows[col].hist(bins=10)
    plt.title(f'Histogram of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
    print(df_cleaned_rows[col].mean())
    print(df_cleaned_rows[col].median())
3.768420902031196e+16
4.050856516269754e+16
649.7808491932147
606.0
In [32]:
for col in categorical_cols:
    plt.figure(figsize=(15, 6))
    
    # Plot only the top 10 categories for better readability
    top_n = 20
    df_cleaned_rows[col].value_counts().nlargest(top_n).plot(kind='bar', color='skyblue')
    
    plt.title(f'Bar Plot of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.xticks(rotation=90)
    plt.show()

    dominating_category = df_cleaned_rows[col].value_counts().idxmax()
    print(f"In column '{col}', the dominating category is '{dominating_category}' with {df_cleaned_rows[col].value_counts().max()} occurrences.\n")
In column 'B2B', the dominating category is 'False' with 115191 occurrences.

In column 'Fulfilment', the dominating category is 'Amazon' with 83621 occurrences.

In column 'ship-service-level', the dominating category is 'Expedited' with 82705 occurrences.

In column 'Courier Status', the dominating category is 'Shipped' with 109461 occurrences.

In column 'Size', the dominating category is 'M' with 20339 occurrences.

In column 'Status', the dominating category is 'Shipped' with 77580 occurrences.

In column 'Category', the dominating category is 'Set' with 45079 occurrences.

In column 'Qty', the dominating category is '1' with 115637 occurrences.

In column 'Style', the dominating category is 'JNE3797' with 3676 occurrences.

In column 'ASIN', the dominating category is 'B09SDXFFQ1' with 660 occurrences.

In column 'ship-city', the dominating category is 'Bengaluru' with 11004 occurrences.

In column 'ship-state', the dominating category is 'Maharashtra' with 20273 occurrences.

In [33]:
df_cleaned_rows['RecencyDate'] = df_cleaned_rows['Date']

RFM Analysis by Ship-State¶

In [34]:
current_date = df_cleaned_rows['Date'].max()

rfm_data = df_cleaned_rows.groupby('ship-state').agg({
    'RecencyDate': lambda x: (current_date - x.max()).days,  # Recency
    'Date': 'count',                                   # Frequency
    'Amount': 'sum'                                      # Monetary
}).rename(columns={
    'RecencyDate': 'Recency',
    'Date': 'Frequency',
    'Amount': 'Monetary'
})

rfm_data
Out[34]:
Recency Frequency Monetary
ship-state
Andaman & nicobar 0 223 146093.0
Andhra pradesh 0 4795 3049151.0
Arunachal pradesh 1 135 94865.0
Assam 0 1482 976951.0
Bihar 0 1868 1351529.0
Chandigarh 0 304 203354.0
Chhattisgarh 0 820 541811.0
Dadra and nagar 3 58 39276.0
Delhi 0 6385 4231084.0
Goa 0 1051 622220.0
Gujarat 0 3971 2618903.0
Haryana 0 4043 2791446.0
Himachal pradesh 1 675 470548.0
Jammu & kashmir 0 614 432712.0
Jharkhand 0 1279 868953.0
Karnataka 0 15844 10153100.0
Kerala 0 5793 3618505.0
Ladakh 1 40 36737.0
Lakshadweep 23 3 2441.0
Madhya pradesh 0 2258 1525537.0
Maharashtra 0 20273 12864511.0
Manipur 1 294 210922.0
Meghalaya 0 184 111234.0
Mizoram 1 67 38503.0
Nagaland 0 172 140129.0
Odisha 0 1904 1324830.0
Pondicherry 0 308 182198.0
Punjab 0 1732 1163716.0
Rajasthan 0 2424 1682199.0
Rajshthan 61 2 1126.0
Sikkim 0 182 134847.0
Tamil nadu 0 10345 6241913.0
Telangana 0 10208 6642955.0
Tripura 0 134 86799.0
Uttar pradesh 0 9462 6494393.0
Uttarakhand 0 1383 932314.0
West bengal 0 5301 3357170.0
In [35]:
scaler = StandardScaler()
normalized_rfm = pd.DataFrame(scaler.fit_transform(rfm_data), columns=rfm_data.columns)
print(normalized_rfm)
     Recency  Frequency  Monetary
0  -0.238096  -0.631746 -0.642679
1  -0.238096   0.359937  0.343783
2  -0.142340  -0.650833 -0.660086
3  -0.238096  -0.358664 -0.360353
4  -0.238096  -0.274940 -0.233071
5  -0.238096  -0.614177 -0.623222
6  -0.238096  -0.502255 -0.508214
7   0.049172  -0.667535 -0.678976
8  -0.238096   0.704813  0.745406
9  -0.238096  -0.452150 -0.480891
10 -0.238096   0.181208  0.197585
11 -0.238096   0.196825  0.256215
12 -0.142340  -0.533706 -0.532429
13 -0.238096  -0.546937 -0.545286
14 -0.238096  -0.402696 -0.397050
15 -0.238096   2.756503  2.757714
16 -0.238096   0.576406  0.537251
17 -0.142340  -0.671439 -0.679838
18  1.964292  -0.679465 -0.691492
19 -0.238096  -0.190347 -0.173942
20 -0.238096   3.717168  3.679054
21 -0.142340  -0.616346 -0.620650
22 -0.238096  -0.640205 -0.654524
23 -0.142340  -0.665583 -0.679238
24 -0.238096  -0.642808 -0.644706
25 -0.238096  -0.267131 -0.242143
26 -0.238096  -0.613309 -0.630411
27 -0.238096  -0.304439 -0.296890
28 -0.238096  -0.154341 -0.120709
29  5.603020  -0.679682 -0.691939
30 -0.238096  -0.640639 -0.646501
31 -0.238096   1.563751  1.428688
32 -0.238096   1.534035  1.564962
33 -0.238096  -0.651050 -0.662827
34 -0.238096   1.372225  1.514481
35 -0.238096  -0.380138 -0.375520
36 -0.238096   0.469690  0.448449
In [36]:
rfm_data
Out[36]:
Recency Frequency Monetary
ship-state
Andaman & nicobar 0 223 146093.0
Andhra pradesh 0 4795 3049151.0
Arunachal pradesh 1 135 94865.0
Assam 0 1482 976951.0
Bihar 0 1868 1351529.0
Chandigarh 0 304 203354.0
Chhattisgarh 0 820 541811.0
Dadra and nagar 3 58 39276.0
Delhi 0 6385 4231084.0
Goa 0 1051 622220.0
Gujarat 0 3971 2618903.0
Haryana 0 4043 2791446.0
Himachal pradesh 1 675 470548.0
Jammu & kashmir 0 614 432712.0
Jharkhand 0 1279 868953.0
Karnataka 0 15844 10153100.0
Kerala 0 5793 3618505.0
Ladakh 1 40 36737.0
Lakshadweep 23 3 2441.0
Madhya pradesh 0 2258 1525537.0
Maharashtra 0 20273 12864511.0
Manipur 1 294 210922.0
Meghalaya 0 184 111234.0
Mizoram 1 67 38503.0
Nagaland 0 172 140129.0
Odisha 0 1904 1324830.0
Pondicherry 0 308 182198.0
Punjab 0 1732 1163716.0
Rajasthan 0 2424 1682199.0
Rajshthan 61 2 1126.0
Sikkim 0 182 134847.0
Tamil nadu 0 10345 6241913.0
Telangana 0 10208 6642955.0
Tripura 0 134 86799.0
Uttar pradesh 0 9462 6494393.0
Uttarakhand 0 1383 932314.0
West bengal 0 5301 3357170.0

Use KMeans clustering to segment customers into different groups.¶

In [37]:
wcss = []
for i in range(1, 7):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(normalized_rfm)
    wcss.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(10,5))
plt.plot(range(1, 7), wcss, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('K-means clustering Elbow Method')
plt.show()



optimal_clusters = 2 
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(normalized_rfm)

rfm_data['Cluster'] = clusters
In [38]:
kmeans = KMeans(n_clusters = 2)
y_kmeans = kmeans.fit_predict(normalized_rfm)

rfm_data['Cluster_ID'] = y_kmeans

rfm_data
Out[38]:
Recency Frequency Monetary Cluster Cluster_ID
ship-state
Andaman & nicobar 0 223 146093.0 0 1
Andhra pradesh 0 4795 3049151.0 0 1
Arunachal pradesh 1 135 94865.0 0 1
Assam 0 1482 976951.0 0 1
Bihar 0 1868 1351529.0 0 1
Chandigarh 0 304 203354.0 0 1
Chhattisgarh 0 820 541811.0 0 1
Dadra and nagar 3 58 39276.0 0 1
Delhi 0 6385 4231084.0 0 1
Goa 0 1051 622220.0 0 1
Gujarat 0 3971 2618903.0 0 1
Haryana 0 4043 2791446.0 0 1
Himachal pradesh 1 675 470548.0 0 1
Jammu & kashmir 0 614 432712.0 0 1
Jharkhand 0 1279 868953.0 0 1
Karnataka 0 15844 10153100.0 1 0
Kerala 0 5793 3618505.0 0 1
Ladakh 1 40 36737.0 0 1
Lakshadweep 23 3 2441.0 0 1
Madhya pradesh 0 2258 1525537.0 0 1
Maharashtra 0 20273 12864511.0 1 0
Manipur 1 294 210922.0 0 1
Meghalaya 0 184 111234.0 0 1
Mizoram 1 67 38503.0 0 1
Nagaland 0 172 140129.0 0 1
Odisha 0 1904 1324830.0 0 1
Pondicherry 0 308 182198.0 0 1
Punjab 0 1732 1163716.0 0 1
Rajasthan 0 2424 1682199.0 0 1
Rajshthan 61 2 1126.0 0 1
Sikkim 0 182 134847.0 0 1
Tamil nadu 0 10345 6241913.0 1 0
Telangana 0 10208 6642955.0 1 0
Tripura 0 134 86799.0 0 1
Uttar pradesh 0 9462 6494393.0 1 0
Uttarakhand 0 1383 932314.0 0 1
West bengal 0 5301 3357170.0 0 1
In [39]:
facet = sns.lmplot(rfm_data, x='Frequency', y='Monetary', hue='Cluster_ID', 
                   fit_reg=False, legend=True, legend_out=True)
In [40]:
import plotly.express as px

fig = px.scatter_3d(rfm_data, x='Frequency', y='Monetary', z='Recency', color='Cluster_ID', hover_name = rfm_data.index )
fig.show()
In [41]:
rfm_data[rfm_data['Cluster_ID'] == 1]
Out[41]:
Recency Frequency Monetary Cluster Cluster_ID
ship-state
Andaman & nicobar 0 223 146093.0 0 1
Andhra pradesh 0 4795 3049151.0 0 1
Arunachal pradesh 1 135 94865.0 0 1
Assam 0 1482 976951.0 0 1
Bihar 0 1868 1351529.0 0 1
Chandigarh 0 304 203354.0 0 1
Chhattisgarh 0 820 541811.0 0 1
Dadra and nagar 3 58 39276.0 0 1
Delhi 0 6385 4231084.0 0 1
Goa 0 1051 622220.0 0 1
Gujarat 0 3971 2618903.0 0 1
Haryana 0 4043 2791446.0 0 1
Himachal pradesh 1 675 470548.0 0 1
Jammu & kashmir 0 614 432712.0 0 1
Jharkhand 0 1279 868953.0 0 1
Kerala 0 5793 3618505.0 0 1
Ladakh 1 40 36737.0 0 1
Lakshadweep 23 3 2441.0 0 1
Madhya pradesh 0 2258 1525537.0 0 1
Manipur 1 294 210922.0 0 1
Meghalaya 0 184 111234.0 0 1
Mizoram 1 67 38503.0 0 1
Nagaland 0 172 140129.0 0 1
Odisha 0 1904 1324830.0 0 1
Pondicherry 0 308 182198.0 0 1
Punjab 0 1732 1163716.0 0 1
Rajasthan 0 2424 1682199.0 0 1
Rajshthan 61 2 1126.0 0 1
Sikkim 0 182 134847.0 0 1
Tripura 0 134 86799.0 0 1
Uttarakhand 0 1383 932314.0 0 1
West bengal 0 5301 3357170.0 0 1

Advanced Machine Learning Analysis¶

Feature Engineering¶

In [42]:
label_encoder = LabelEncoder()
df_cleaned_rows[categorical_cols]=df_cleaned_rows[categorical_cols].apply(LabelEncoder().fit_transform)
In [43]:
min_max_scaler = MinMaxScaler()
df_cleaned_rows[all_cols] = min_max_scaler.fit_transform(df_cleaned_rows[all_cols])
In [44]:
df_cleaned_rows
Out[44]:
Order ID Date Status Fulfilment ship-service-level Style Category Size ASIN Courier Status Qty Amount ship-city ship-state B2B RecencyDate
1 0.003865 2022-04-30 5 1 1 845 8 0 5229 0 0 0.072708 764 15 0 2022-04-30
2 0.979281 2022-04-30 3 0 0 531 8 8 666 0 0 0.058918 4412 20 1 2022-04-30
4 0.992046 2022-04-30 3 0 0 752 6 0 4308 0 0 0.102794 1237 31 0 2022-04-30
5 0.979618 2022-04-30 3 0 0 1230 5 8 2904 0 0 0.147564 2054 34 0 2022-04-30
6 0.998214 2022-04-30 3 0 0 180 5 5 1656 0 0 0.116941 1155 5 0 2022-04-30
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
128970 0.989917 2022-05-31 3 0 0 772 8 8 3920 0 0 0.092586 2405 32 0 2022-05-31
128971 0.974602 2022-05-31 3 0 0 1346 5 6 6780 0 0 0.178904 2209 11 0 2022-05-31
128972 0.995608 2022-05-31 3 0 0 236 7 10 4031 0 0 0.123567 2405 32 0 2022-05-31
128973 0.973187 2022-05-31 3 0 0 119 5 9 1300 0 0 0.214721 2257 10 0 2022-05-31
128974 0.998923 2022-05-31 3 0 0 110 5 7 1258 0 0 0.124642 5309 6 0 2022-05-31

116016 rows × 16 columns

In [45]:
id_col= ['Order ID']
drop_col= ['Courier Status','B2B','Date','RecencyDate']
target_col = ['Status']
In [46]:
X = df_cleaned_rows.drop(target_col, axis=1)
X = df_cleaned_rows.drop(drop_col, axis=1)
y=df_cleaned_rows[target_col]
In [47]:
X.shape, y.shape
Out[47]:
((116016, 12), (116016, 1))
In [48]:
X
Out[48]:
Order ID Status Fulfilment ship-service-level Style Category Size ASIN Qty Amount ship-city ship-state
1 0.003865 5 1 1 845 8 0 5229 0 0.072708 764 15
2 0.979281 3 0 0 531 8 8 666 0 0.058918 4412 20
4 0.992046 3 0 0 752 6 0 4308 0 0.102794 1237 31
5 0.979618 3 0 0 1230 5 8 2904 0 0.147564 2054 34
6 0.998214 3 0 0 180 5 5 1656 0 0.116941 1155 5
... ... ... ... ... ... ... ... ... ... ... ... ...
128970 0.989917 3 0 0 772 8 8 3920 0 0.092586 2405 32
128971 0.974602 3 0 0 1346 5 6 6780 0 0.178904 2209 11
128972 0.995608 3 0 0 236 7 10 4031 0 0.123567 2405 32
128973 0.973187 3 0 0 119 5 9 1300 0 0.214721 2257 10
128974 0.998923 3 0 0 110 5 7 1258 0 0.124642 5309 6

116016 rows × 12 columns

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.20)

Logistic Regression¶

In [50]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
In [51]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_rep)
Accuracy: 0.6757

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1095
           1       0.00      0.00      0.00       153
           2       0.00      0.00      0.00        60
           3       0.68      1.00      0.81     15679
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00      5566
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00       205
           9       0.00      0.00      0.00         5
          10       0.00      0.00      0.00       405
          11       0.00      0.00      0.00        30

    accuracy                           0.68     23204
   macro avg       0.06      0.09      0.07     23204
weighted avg       0.46      0.68      0.54     23204

Random Forest¶

In [52]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
In [53]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_rep)
Accuracy: 0.9994

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1095
           1       1.00      1.00      1.00       153
           2       1.00      1.00      1.00        60
           3       1.00      1.00      1.00     15679
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00      5566
           7       1.00      0.40      0.57         5
           8       0.96      1.00      0.98       205
           9       0.00      0.00      0.00         5
          10       0.99      1.00      0.99       405
          11       1.00      0.80      0.89        30

    accuracy                           1.00     23204
   macro avg       0.81      0.75      0.77     23204
weighted avg       1.00      1.00      1.00     23204